Contents

  • Introduction
  • Removing Features from Feature Table

Introduction

This IPython notebook illustrates how to remove features from a feature table. First, we need to import the py_entitymatching package and other libraries as follows:


In [1]:
# Import py_entitymatching package
import py_entitymatching as em
import os
import pandas as pd


/Users/pradap/miniconda3/lib/python3.5/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

Then, read the (sample) input tables for blocking purposes:


In [3]:
# Locate the 'datasets' directory under the py_entitymatching install path.
# os.path.join inserts the platform separator and avoids doubled separators.
datasets_dir = os.path.join(em.get_install_path(), 'datasets')

# Build the paths of the two input tables
path_A = os.path.join(datasets_dir, 'person_table_A.csv')
path_B = os.path.join(datasets_dir, 'person_table_B.csv')

In [4]:
# Load both CSV files (table A first, then table B), declaring 'ID' as the key attribute of each
A, B = (em.read_csv_metadata(csv_path, key='ID') for csv_path in (path_A, path_B))

In [5]:
# Get the feature table for blocking, built from input tables A and B.
# Each row of the result describes one feature (see the preview further down).
feature_table = em.get_features_for_blocking(A, B)

Removing Features from Feature Table


In [6]:
# Check the type of the feature table; the output shows a pandas DataFrame,
# so features can be removed with ordinary DataFrame operations.
type(feature_table)


Out[6]:
pandas.core.frame.DataFrame

In [9]:
# Preview the first few features before removing anything
feature_table.head()


Out[9]:
feature_name left_attribute right_attribute left_attr_tokenizer right_attr_tokenizer simfunction function function_source is_auto_generated
0 ID_ID_lev_dist ID ID None None lev_dist <function ID_ID_lev_dist at 0x109a7c048> from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... True
1 ID_ID_lev_sim ID ID None None lev_sim <function ID_ID_lev_sim at 0x11436a158> from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... True
2 ID_ID_jar ID ID None None jaro <function ID_ID_jar at 0x11436a1e0> from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... True
3 ID_ID_jwn ID ID None None jaro_winkler <function ID_ID_jwn at 0x11436a268> from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... True
4 ID_ID_exm ID ID None None exact_match <function ID_ID_exm at 0x11436a510> from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... True

In [11]:
# Drop the feature whose index label is 0 (the first row of the generated table).
# Note that drop() removes by index label, not by position.
# errors='ignore' keeps this cell re-runnable: once label 0 is gone, a plain
# drop(0) would raise on a second execution.
feature_table = feature_table.drop(0, errors='ignore')

In [12]:
# Preview the table again; the row with index label 0 should no longer appear
feature_table.head()


Out[12]:
feature_name left_attribute right_attribute left_attr_tokenizer right_attr_tokenizer simfunction function function_source is_auto_generated
1 ID_ID_lev_sim ID ID None None lev_sim <function ID_ID_lev_sim at 0x11436a158> from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... True
2 ID_ID_jar ID ID None None jaro <function ID_ID_jar at 0x11436a1e0> from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... True
3 ID_ID_jwn ID ID None None jaro_winkler <function ID_ID_jwn at 0x11436a268> from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... True
4 ID_ID_exm ID ID None None exact_match <function ID_ID_exm at 0x11436a510> from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... True
5 ID_ID_jac_qgm_3_qgm_3 ID ID qgm_3 qgm_3 jaccard <function ID_ID_jac_qgm_3_qgm_3 at 0x11436a6a8> from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... True

In [15]:
# Keep only the features whose left attribute is 'name'; every other feature is removed
feature_table = feature_table.loc[feature_table['left_attribute'] == 'name']

In [14]:
# Display the features that remain after filtering on the left attribute
feature_table


Out[14]:
feature_name left_attribute right_attribute left_attr_tokenizer right_attr_tokenizer simfunction function function_source is_auto_generated
6 name_name_jac_qgm_3_qgm_3 name name qgm_3 qgm_3 jaccard <function name_name_jac_qgm_3_qgm_3 at 0x11436a730> from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... True
7 name_name_cos_dlm_dc0_dlm_dc0 name name dlm_dc0 dlm_dc0 cosine <function name_name_cos_dlm_dc0_dlm_dc0 at 0x11436a7b8> from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... True
8 name_name_jac_dlm_dc0_dlm_dc0 name name dlm_dc0 dlm_dc0 jaccard <function name_name_jac_dlm_dc0_dlm_dc0 at 0x11436a840> from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... True
9 name_name_mel name name None None monge_elkan <function name_name_mel at 0x11436a8c8> from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... True
10 name_name_lev_dist name name None None lev_dist <function name_name_lev_dist at 0x11436a950> from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... True
11 name_name_lev_sim name name None None lev_sim <function name_name_lev_sim at 0x11436a9d8> from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... True
12 name_name_nmw name name None None needleman_wunsch <function name_name_nmw at 0x11436aa60> from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... True
13 name_name_sw name name None None smith_waterman <function name_name_sw at 0x11436aae8> from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... True

In [16]:
# Keep only the features whose similarity function is 'jaccard'; every other feature is removed
feature_table = feature_table.loc[feature_table['simfunction'] == 'jaccard']

In [17]:
# Display the features that remain after filtering on the similarity function
feature_table


Out[17]:
feature_name left_attribute right_attribute left_attr_tokenizer right_attr_tokenizer simfunction function function_source is_auto_generated
6 name_name_jac_qgm_3_qgm_3 name name qgm_3 qgm_3 jaccard <function name_name_jac_qgm_3_qgm_3 at 0x11436a730> from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... True
8 name_name_jac_dlm_dc0_dlm_dc0 name name dlm_dc0 dlm_dc0 jaccard <function name_name_jac_dlm_dc0_dlm_dc0 at 0x11436a840> from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... True